"""
Scrape the rank, poster image URL, title, and rating of every movie on the
Douban Top 250 chart.
URL: https://movie.douban.com/top250
"""
import time
from urllib.parse import urljoin

import requests
from lxml import etree

# First page of the Douban Top 250 chart; the crawl starts from this address.
url = 'https://movie.douban.com/top250'

# HTTP headers sent with each page request (a desktop browser User-Agent).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/90.0.4430.72 Safari/537.36 Edg/90.0.818.39'
    ),
}


def _first_or_none(nodes):
    """Return the first XPath match in *nodes*, or None when nothing matched."""
    return nodes[0] if nodes else None


def spider_douban250(url):
    """Fetch one page of the Douban Top 250 chart and extract its entries.

    Args:
        url: Address of the chart page to download.

    Returns:
        A ``(rows, next_page)`` tuple. ``rows`` is a list of
        ``(rank, image_url, title, rating)`` tuples, one per movie entry on
        the page; a field that is absent from an entry is ``None``.
        ``next_page`` is the list of ``href`` values found on the "next" link
        and is empty when the page has no such link.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    print('正在抓取{}'.format(url))
    # Bound the wait so a stalled connection cannot hang the whole crawl.
    r = requests.get(url, headers=headers, timeout=10)
    # Surface HTTP errors instead of parsing an error page as an empty chart.
    r.raise_for_status()
    t = etree.HTML(r.text)

    # Read the fields relative to each entry: zipping four page-wide result
    # lists would shift every later row if a single entry lacked one field.
    rows = [
        (
            _first_or_none(item.xpath('div[1]/em/text()')),
            _first_or_none(item.xpath('div[1]/a/img/@src')),
            _first_or_none(item.xpath('div[2]/div[1]/a/span[1]/text()')),
            _first_or_none(item.xpath('div[2]/div[2]/div[1]/span[2]/text()')),
        )
        for item in t.xpath('//div[@class="item"]')
    ]
    next_page = t.xpath('//span[@class="next"]/a/@href')
    return rows, next_page


# Rows gathered from every page that has been crawled so far.
rank_list = []

# Pagination, approach 2: keep following the page's "next" link until a page
# no longer provides one.
next_url = url
while next_url:
    rank_current_page, next_page = spider_douban250(next_url)
    rank_list += rank_current_page
    # Resolve the link against the page it came from, so relative and
    # absolute hrefs both yield a valid address.
    next_url = urljoin(next_url, next_page[0]) if next_page else None
    if next_url:
        # Pause only when another request follows; no delay after the last page.
        time.sleep(0.5)

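# # Pagination, approach 1 (disabled): build each page URL from its offset.
# # NOTE: spider_douban250 returns (rows, next_page); only the rows are kept.
# for num in range(10):
#     page_url = url + f"?start={25 * num}&filter="
#     time.sleep(0.5)
#     rank_list += spider_douban250(page_url)[0]
#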

# Turn each collected row into a dict keyed by field name, then print them all.
_FIELD_NAMES = ("rank_num", "img", "title", "rating")
rank_dict_list = [dict(zip(_FIELD_NAMES, row)) for row in rank_list]
print(rank_dict_list)
